In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np

Loading and inspecting the data¶

In [5]:
# Load the Pima Indians Diabetes dataset.
# NOTE(review): hardcoded absolute local path — this breaks on any other
# machine; prefer a path relative to a configurable data directory.
DATA_PATH = r"C:\Users\Pratiksha Bargal\Downloads\diabetes.csv"
df = pd.read_csv(DATA_PATH)
In [6]:
# First five rows — quick sanity check of the columns and raw values.
df.head()
Out[6]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [7]:
# Last five rows — confirms the file was read to the end (768 rows).
df.tail()
Out[7]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0
In [8]:
# Column dtypes and non-null counts; all 9 columns are numeric with no
# NaNs at this point (missing values are encoded as 0, handled below).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [9]:
# Count missing values per column (all zero here — the dataset encodes
# missing measurements as 0 rather than NaN).
df.isna().sum()
Out[9]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [10]:
# In these clinical measurements a value of 0 encodes "missing" (a glucose
# or BMI of 0 is physiologically impossible), so convert 0 to NaN for
# proper missing-value handling. One vectorized replace over the column
# list instead of five copy-pasted per-column statements.
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
In [11]:
# Re-inspect the first rows: the former zeros now show as NaN.
df.head()
Out[11]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148.0 72.0 35.0 NaN 33.6 0.627 50 1
1 1 85.0 66.0 29.0 NaN 26.6 0.351 31 0
2 8 183.0 64.0 NaN NaN 23.3 0.672 32 1
3 1 89.0 66.0 23.0 94.0 28.1 0.167 21 0
4 0 137.0 40.0 35.0 168.0 43.1 2.288 33 1
In [12]:
# Summary statistics; the per-column counts now reveal how many values
# are missing (e.g. Insulin has only 394 of 768 present).
df.describe()
Out[12]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 763.000000 733.000000 541.000000 394.000000 757.000000 768.000000 768.000000 768.000000
mean 3.845052 121.686763 72.405184 29.153420 155.548223 32.457464 0.471876 33.240885 0.348958
std 3.369578 30.535641 12.382158 10.476982 118.775855 6.924988 0.331329 11.760232 0.476951
min 0.000000 44.000000 24.000000 7.000000 14.000000 18.200000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 64.000000 22.000000 76.250000 27.500000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 29.000000 125.000000 32.300000 0.372500 29.000000 0.000000
75% 6.000000 141.000000 80.000000 36.000000 190.000000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

Data Cleaning¶

In [13]:
# Fill the NaNs introduced above using K-nearest-neighbour imputation
# (each missing value is estimated from the 2 most similar rows).
# NOTE(review): the imputer is fit on the full dataset, including the
# Outcome column and rows that later end up in the test split — potential
# data leakage; confirm this is acceptable for this analysis.
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=2)
df = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

Exploratory data visualization¶

In [14]:
# Class balance: share of non-diabetic vs. diabetic patients.
# value_counts() is sorted by frequency, so the majority class
# (Outcome == 0) comes first and lines up with the first label.
outcome_counts = df["Outcome"].value_counts()

plt.figure(figsize=(5, 5))
plt.pie(
    outcome_counts,
    labels=["Non-diabetic", "Diabetic"],
    radius=1,
    autopct="%1.1f%%",
    labeldistance=1.15,
)
plt.legend(title="Outcome:", loc="upper right", bbox_to_anchor=(1.6, 1))
plt.show()
In [15]:
# Box plots of every column before outlier treatment, for comparison
# with the post-clipping plot below.
plt.figure(figsize=(14, 8))
sns.boxplot(data=df)
plt.title("the columns before handling outliers")
plt.show()
In [16]:
# Cap outliers with the Tukey IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest bound.
# Vectorized over all columns at once instead of the per-column loop.
# NOTE(review): the binary Outcome column is clipped too; with quartiles
# 0 and 1 its bounds are [-1.5, 2.5] so this is a no-op here, but
# excluding the target would be cleaner.
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
df = df.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr, axis=1)

plt.figure(figsize=(14, 6))
sns.boxplot(data=df)
plt.title("the columns after handling outliers")
plt.show()
In [17]:
# Histogram of every column to inspect distributions after cleaning;
# trailing ';' suppresses the array-of-Axes repr.
df.hist(bins=50, figsize=(20,15));
In [18]:
# Pairwise Pearson correlations between all columns, annotated to one
# decimal place.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.1f', linewidths=0.5, ax=ax);
In [19]:
# Glucose against the binary outcome.
ax = sns.scatterplot(data=df, x='Glucose', y='Outcome')
ax.set_title('Scatter Plot between Glucose and Outcome')
ax.set_xlabel('Glucose')
ax.set_ylabel('Outcome')
plt.show()
In [20]:
# Glucose distribution split by class.
ax = sns.boxplot(data=df, x='Outcome', y='Glucose')
ax.set_title('Boxplot of Glucose by Outcome')
ax.set_xlabel('Outcome')
ax.set_ylabel('Glucose')
plt.show()
In [21]:
# Glucose Distribution split by Outcome.
# NOTE(review): color_discrete_map uses int keys (0/1) while plotly
# stringifies the color column — confirm the custom colors actually apply.
glucose_hist = px.histogram(
    data_frame=df,
    x='Glucose',
    color='Outcome',
    title="Glucose Distribution",
    height=500,
    color_discrete_map={0: 'black', 1: 'orange'},
)
glucose_hist.update_layout({'title': {'x': 0.5}})
glucose_hist.show();
In [22]:
# Insulin Distribution split by Outcome.
# (Removed the redundant in-cell `import plotly.express as px` — the
# module is already imported in the top import cell.)
hist = px.histogram(
    data_frame=df,
    x="Insulin",
    color='Outcome',
    title="Insulin Distribution",
    height=500,
    color_discrete_map={0: 'black', 1: 'orange'},
)
hist.update_layout({'title': {'x': 0.5}})
hist.show();
In [23]:
# BMI Distribution split by Outcome.
# (Removed the redundant in-cell `import plotly.express as px` — the
# module is already imported in the top import cell.)
hist = px.histogram(
    data_frame=df,
    x="BMI",
    color='Outcome',
    title="BMI Distribution",
    height=500,
    color_discrete_map={0: 'black', 1: 'orange'},
)
hist.update_layout({'title': {'x': 0.5}})
hist.show();

Modeling¶

In [24]:
# Features (X) and target (y); Outcome is the label to predict.
X = df.drop(columns="Outcome")
y = df["Outcome"]

Training and testing¶

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = .2, random_state=0)

Accuracy of the models¶

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Candidate classifiers, seeded where applicable so the comparison is
# reproducible.
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    SVC(kernel='linear', C=1),
    KNeighborsClassifier(n_neighbors=3),
    LogisticRegression(random_state=42, max_iter=1000),  # max_iter raised to avoid convergence warning
]

# Train and evaluate each model. Collect the results as a list of records
# and build the DataFrame once at the end — growing a DataFrame row-by-row
# inside the loop (results.loc[len(results)] = ...) is a quadratic
# anti-pattern.
records = []
for model in models:
    name = model.__class__.__name__
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    records.append({"Model": name, "Train Accuracy": train_accuracy, "Test Accuracy": test_accuracy})
    print("{} Accuracy: {:.2f}%".format(name, test_accuracy * 100))
    print("{} Classification Report:\n{}".format(name, classification_report(y_test, y_test_pred)))
    print("\n" + "="*50 + "\n")

results = pd.DataFrame(records)

# Print the results DataFrame
print(results)
RandomForestClassifier Accuracy: 77.27%
RandomForestClassifier Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.84      0.84       107
         1.0       0.63      0.62      0.62        47

    accuracy                           0.77       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.77      0.77      0.77       154


==================================================

SVC Accuracy: 80.52%
SVC Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.92      0.87       107
         1.0       0.74      0.55      0.63        47

    accuracy                           0.81       154
   macro avg       0.78      0.73      0.75       154
weighted avg       0.80      0.81      0.80       154


==================================================

KNeighborsClassifier Accuracy: 69.48%
KNeighborsClassifier Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.80      0.79       107
         1.0       0.50      0.45      0.47        47

    accuracy                           0.69       154
   macro avg       0.63      0.63      0.63       154
weighted avg       0.69      0.69      0.69       154


==================================================

LogisticRegression Accuracy: 79.87%
LogisticRegression Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.90      0.86       107
         1.0       0.71      0.57      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154


==================================================

                    Model  Train Accuracy  Test Accuracy
0  RandomForestClassifier        1.000000       0.772727
1                     SVC        0.762215       0.805195
2    KNeighborsClassifier        0.838762       0.694805
3      LogisticRegression        0.768730       0.798701
In [ ]: